{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 使用torchtext库处理数据\n",
    "参考：[Torchtext 详细介绍 Part.1](https://zhuanlan.zhihu.com/p/37223078)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import spacy\n",
    "import torchtext\n",
    "from torchtext import data\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The sequel, Yes, Prime Minister, ran from 1986 to 1988. In total there were 38 episodes, of which all but one lasted half an hour. Almost all episodes ended with a variation of the title of the series spoken as the answer to a question posed by the same character, Jim Hacker. Several episodes were adapted for BBC Radio, and a stage play was produced in 2010, the latter leading to a new television series on UKTV Gold in 2013.\n"
     ]
    }
   ],
   "source": [
    "text = \"The sequel, Yes, Prime Minister, ran from 1986 to 1988. In total there were 38 episodes, of which all but one lasted half an hour. Almost all episodes ended with a variation of the title of the series spoken as the answer to a question posed by the same character, Jim Hacker. Several episodes were adapted for BBC Radio, and a stage play was produced in 2010, the latter leading to a new television series on UKTV Gold in 2013.\"\n",
    "print(text)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1.定义token函数，并创建Field"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['The', 'sequel', ',', 'Yes', ',', 'Prime', 'Minister', ',', 'ran', 'from']\n"
     ]
    }
   ],
   "source": [
    "spacy_en = spacy.load('en')          # 导入英语模型\n",
    "def tokenizer(text):                 # tokenizer函数\n",
    "    \"\"\"\n",
    "    func: 返回一个token的列表\n",
    "    \"\"\"\n",
    "    return [token.text for token in spacy_en.tokenizer(text)]\n",
    "#     return [token.text for token in spacy_en(text)]\n",
    "toke = tokenizer(text)\n",
    "print(toke[:10])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "TEXT = data.Field(sequential=True, tokenize=tokenizer, lower=True)\n",
    "LABEL = data.Field(sequential=False, use_vocab=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**如果label是整型，不需要numericalize，需要设置use_vocab=False**"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. 加载语料库"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_path = os.path.join('..', 'data', 'text')\n",
    "train, val, test = data.TabularDataset.splits(\n",
    "    path=data_path, train='train.csv', validation='val.csv',test='test.csv', format='csv',\n",
    "    fields=[('text', TEXT), ('labels', LABEL)]\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "dict_keys(['examples', 'fields'])"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "TEXT.build_vocab(train)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. 使用批次读取数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_iter, val_iter, test_iter = data.Iterator.splits(\n",
    "    (train, val, test), sort_key=lambda x:len(x.text),\n",
    "    batch_size=(10,3,1), device=-1              # cpu:-1 gpu:0\n",
    ")\n",
    "train_loader = iter(train_iter)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# list(train_iter)                              # ???????"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "list(train_iter)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
